#################################################################################
#_______________________________ Exchange Shares _______________________________#
#################################################################################

# This script produces the figures and regression outputs describing the
# market shares of the Top 8 exchanges.
#
# "Ex_Daily_Shares_2007_2018.csv" is a day-exchange level dataset with a row
# for the exchange's volume share for that day (e.g. NYSE's share of volume
# on May 8, 2011). "Ex_Weekly_Shares_2007_2018.csv" is a week-exchange level
# dataset with a row for the exchange's volume share for that week (e.g. NYSE's
# share of volume during the week of May 8, 2011).
#
# This script also uses "Sym_Ex_Date_Shares_2015.csv" to demonstrate the
# stability of exchange market shares at the symbol-level.
#
# Note that in this script we do not filter symbols. The exchange's share of
# volume is the exchange's share of regular volume for all symbols present
# in the Daily TAQ Trades Data during the observation period.
#
# The output of the script is generated in the following order:
#
# Output from "Ex_Daily_Shares_2007_2018.csv" and  "Ex_Weekly_Shares_2007_2018.csv".
#
# 1. Share of volume (in shares) traded in 2015 for different groups of exchanges
#   (top 5 maker-takers, top taker-makers, regional exchanges).
# 2. Plot of weekly exchange shares from 2011 to 2015 for the Top 8 exchanges
# 3. R2 of regression between daily exchange shares and exchange fixed effects
#   between 2011-2015 among the Top 8 exchanges.
#
# Output from "Sym_Ex_Date_Shares_2015.csv".
#
# 4. Regression of daily symbol exchange shares with exchange fixed effects
#   and NYSE listed indicator.

################################################################################
#_______________________________ Load Libraries _______________________________#
################################################################################

library(ggplot2)
library(dplyr)
library(data.table)
library(lubridate)

# Silence warnings for the rest of the session (warn = -1 makes R ignore them)
options(warn = -1)

#############################################################################
#_______________________________ Directories _______________________________#
#############################################################################

# Root directory of the analysis. The value below looks like a placeholder,
# presumably to be replaced with the local path before running -- confirm.
home.dir  <- "path-to-data-appendix/3_Analysis_of_TAQ_data"

# Input data directory, and output directories for figures and tables
data.dir  <- file.path(home.dir, "WRDS_Server/Output")
fig.dir   <- file.path(home.dir, "Final_Output/Figures")
table.dir <- file.path(home.dir, "Final_Output/Tables")

###########################################################################
#_______________________________ Functions _______________________________#
###########################################################################

# getAbbr
# Maps exchange codes to character abbreviations of the exchange names.
#
# The mapping is a single vectorized table lookup, so x can hold any number
# of codes (for example a whole column of exchange codes).
#
# Args:
#   x: a vector of exchange codes (e.g. "N", "T", "Z")
#
# Returns:
#   A character vector with one abbreviation per element of x. Codes that are
#   not in the table, including missing values, are mapped to the string "NA".
#   If x has names they are kept; otherwise, for character input, each element
#   is named by its input code.
#
getAbbr <- function(x){
  abbreviations <- c(A = "AMEX",  B = "BX",   N = "NYSE", P = "ARCA",
                     C = "NSX",   T = "NSDQ", D = "FINRA", X = "PSX",
                     M = "CHX",   Z = "BZX",  Y = "BYX",  K = "EDGX",
                     J = "EDGA",  W = "CBOE", V = "IEX",  I = "ISE")

  # Position of every code in the lookup table (NA when the code is unknown)
  idx <- match(x, names(abbreviations))

  name <- unname(abbreviations[idx])
  # Unknown or missing codes get the literal string "NA"
  name[is.na(idx)] <- "NA"

  # Label the result the same way the input was labelled
  if (!is.null(names(x))) {
    names(name) <- names(x)
  } else if (is.character(x)) {
    names(name) <- x
  }

  return(name)
}

#_______________________________ Regression Functions _______________________________#

# regression.R2
# Regresses y on the x variable(s) without an intercept and reports the R2,
# which is calculated manually from the residuals and the total sum of squares.
#
# We calculate the R2 manually because, when the intercept is suppressed, the
# R2 reported by summary.lm uses a total sum of squares that is not centered
# at the mean.
#
# In other words it calculates sum(exchange_share^2) instead of
# sum((exchange_share - mean(exchange_share))^2).
#
# The total sum of squares is computed on the rows actually used in the fit
# (the model frame), so rows that lm() drops because a regressor is missing
# do not enter the R2.
#
# @returns:
#   R2 of the regression
#
# @args (for reproducing results):
#   data: dataframe (or an object as.data.frame() can convert), the dataset
#     used in the regression.
#     In this context, this dataset is the daily exchange shares from 2011 to 2015.
#   x: string or vector of strings, the name(s) of the independent variable(s)
#     (right-hand-side terms) in this regression. In this context, these are
#     usually the exchange fixed effects.
#   y: string, the name of the dependent variable in this regression.
#     In this context, this is volume share.
#
regression.R2 <- function(data, x, y) {

  # Work on a plain data frame so column extraction by name behaves the same
  # for data.frames, data.tables and tibbles
  data <- as.data.frame(data)

  # Remove all the rows with a missing dependent variable
  data_no_NA <- data[!is.na(data[[y]]), , drop = FALSE]

  # Join the right-hand-side terms with "+" (a single term is passed as is)
  # and suppress the intercept.
  # In the regression for the exchange shares this should be
  # Regular_Sh_S (volume share) ~ 0 + exchange
  rhs <- paste(x, collapse = " + ")
  formula <- as.formula(paste0(y, " ~ 0 + ", rhs))

  # Run the regression
  model <- lm(formula, data = data_no_NA)

  # Dependent variable for exactly the observations used in the fit
  y_used <- model.response(model.frame(model))

  # R2 = 1 - (residual sum of squares) / (centered total sum of squares)
  resid <- residuals(model)
  dif_mean <- y_used - mean(y_used)
  R2 <- 1 - sum(resid^2) / sum(dif_mean^2)

  return(R2)
}

#_______________________________ Plot Functions _______________________________#

# is.labelOverlap
# Returns TRUE if any of the labels are overlapping. The function
# "adjust.exchangeLabels" uses this check to decide whether the labels
# still need to be moved apart.
#
# @returns:
#   TRUE if labels overlap, FALSE if labels do not. With fewer than two
#   non-missing labels there is nothing to overlap and the result is FALSE.
#
# @args (for reproducing results):
#   labels: dataframe, this is a dataframe consisting of the last points
#     in the sample period for each exchange. These points identify where
#     to place the labels.
#   vertical_spacing: float, a constant that specifies minimum vertical space
#     between labels.
#   y_variable: string, the y-variable in the plot.
#
is.labelOverlap <- function(labels, vertical_spacing, y_variable){

  # Label positions from highest to lowest; sort() drops missing values
  positions <- sort(labels[[y_variable]], decreasing = TRUE)
  n <- length(positions)

  # A single label (or none) cannot overlap with anything
  if (n < 2) {
    return(FALSE)
  }

  # Distance between each label and the next one below it. Two neighbouring
  # labels closer than vertical_spacing are considered to be overlapping.
  gaps <- positions[-n] - positions[-1]
  return(any(gaps < vertical_spacing))
}

# adjust.exchangeLabels
# Moves the exchange labels down the y-axis until is.labelOverlap() no longer
# reports an overlap.
#
# @returns:
#   The labels dataframe, sorted by decreasing y_variable, with the y_variable
#   values of crowded labels lowered in steps of vertical_spacing / 2.
#
# @args (for reproducing results):
#   labels: dataframe, this is a dataframe consisting of the last points
#     in the sample period for each exchange. These points identify where
#     to place the labels.
#   vertical_spacing: float, a constant that specifies minimum vertical space
#     between labels.
#   y_variable: string, the y-variable in the plot.
#
adjust.exchangeLabels <- function(labels, vertical_spacing, y_variable){

  # Highest label first; rows with a missing y_variable end up at the bottom
  sorted <- labels[order(-labels[, y_variable]), ]

  # Only the first n_valid rows carry a position that can be adjusted
  n_valid <- sum(!is.na(sorted[, y_variable]))

  # Each move lowers a label by half of the required spacing
  half_step <- vertical_spacing / 2

  repeat {
    # Stop as soon as no pair of neighbouring labels is too close
    if (!is.labelOverlap(sorted, vertical_spacing, y_variable)) {
      break
    }

    # Walk down the labels from the second highest. The row index refers to
    # the initial descending order, so a label is only ever compared with the
    # label that started directly above it, and that one has already been
    # handled in this pass.
    # NOTE(review): 2:n_valid counts downwards when n_valid < 2; the loop is
    # only reached when is.labelOverlap() reports an overlap.
    for (row in 2:n_valid) {
      gap <- sorted[row - 1, y_variable] - sorted[row, y_variable]
      if (gap < vertical_spacing) {
        sorted[row, y_variable] <- sorted[row, y_variable] - half_step
      }
    }
  }

  return(sorted)
}

# plot.exchangeShares
# Plots exchange market shares over time (daily or weekly intervals), writes
# each exchange's label next to its last observation, and saves the figure.
#
# @returns:
#   The value returned by ggsave(). The figure itself is written to figure_path.
#
# @args (for reproducing results):
#   data: df, a dataframe that contains the date variable, the share variable
#     and the grouping variable. It is converted with as.data.frame() so that
#     variables can be extracted by passing their names as strings.
#   interval: string, the name of the variable plotted on the x axis
#     that identifies the date of each observation.
#   y: string, the name of the variable plotted on the y axis
#     that identifies the exchange shares.
#   color_group: string, the name of the variable that identifies which
#     observations are given the same color (and which is used as label text).
#   exchange_color_order: not used inside this function; kept in the signature
#     so that existing calls keep working.
#   colors: vector of colors passed to scale_colour_manual(values = ...).
#   ymin, ymax: floats, limits of the y axis. The limits are applied when at
#     least one of them is supplied; a side left as NA is passed to ylim() as NA.
#   background: logical, if TRUE the grid lines and the panel background are
#     removed and black axis lines are drawn.
#   line_size: float, the size of each of the plotted lines.
#   label_spacing: float, the constant used to determine the size of the
#     right spacing and the vertical spacing (between each exchange label).
#   ylab: string, label for the y-axis.
#   figure_path: string, the name and directory of the figure.
#
plot.exchangeShares <- function(data,
                                interval,
                                y,
                                color_group,
                                exchange_color_order,
                                colors,
                                ymin = NA,
                                ymax = NA,
                                background = FALSE,
                                line_size = 1.3,
                                label_spacing = 0.025,
                                ylab = "Weekly Market Share \n (Proportion of Shares Traded)",
                                figure_path){

  # Plain data frame: the label helpers index columns with df[, "name"]
  data <- as.data.frame(data)

  # Extract dates and exchange shares
  dates <- data[[interval]]
  exchange_shares <- data[[y]]
  first_date <- min(dates)
  last_date <- max(dates)

  # Add some spacing at the end of the plot, proportional to the length of the period
  right_spacing <- (last_date - first_date) * label_spacing

  # Vertical spacing, proportional to the height of the plot (the largest y-value)
  vertical_spacing <- max(exchange_shares, na.rm = TRUE) * label_spacing

  # Plot exchange shares
  g <- ggplot(data, aes_string(x = interval, y = y, group = color_group, colour = color_group)) +
    geom_line(size = line_size) +
    ylab(ylab) +
    xlab("") +
    theme(axis.text.x  = element_text(size = 22),
          axis.text.y = element_text(size = 22),
          axis.title.x = element_text(size = 22),
          axis.title.y = element_text(size = 22)) +
    scale_colour_manual(drop = TRUE, values = colors) +
    scale_x_date(breaks = "1 year", date_labels = "%Y", minor_breaks = "1 year",
                 limits = c(first_date, last_date + right_spacing)) +
    theme(legend.position = "none")

  # Apply the y limits when either bound is given (previously ymin was
  # silently ignored unless ymax was also supplied)
  if (!is.na(ymin) || !is.na(ymax)) {
    g <- g + ylim(ymin, ymax)
  }

  # Create a dataframe containing the last points in the sample period
  # (dropping missing shares). Adjust the exchange shares for these points
  # until they are at least vertical_spacing apart on the y axis.
  # These points identify where to place the labels.
  labels <- data[which(dates == last_date & !is.na(exchange_shares)), , drop = FALSE]
  labels <- adjust.exchangeLabels(labels, vertical_spacing, y)

  # Place labels at the points identified in labels
  g <- g + geom_text(data = labels, size = 6, aes_string(label = color_group), hjust = 0)

  if (isTRUE(background)) {
    g <- g + theme(panel.grid.major = element_blank(),
                   panel.grid.minor = element_blank(),
                   panel.background = element_blank(),
                   axis.line = element_line(colour = "black"))
  }

  # Save the figure in the output path
  ggsave(filename = figure_path,
         plot = g, width = 18, height = 8, dpi = 200)

}

# plot.avgSymbolExchangeSharesBox
# Draws box plots of y for each value of x, with one panel per value of the
# NYSE_label column, and saves the figure.
#
# @returns:
#   The value returned by ggsave(). The figure itself is written to figure_path.
#
# @args (for reproducing results):
#   data: df, a dataframe containing the average exchange volume shares for symbols,
#     across dates in 2015. It must contain a column named NYSE_label, which
#     defines the panels.
#   x: string, name of the variable plotted on the x-axis. In this setting,
#     this is the exchange labels ("NYSE", "NSDQ", etc.)
#   y: string, name of the variable plotted on the y-axis. In this setting,
#     this is the average volume shares.
#   xlab: string, the name of the x axis in the plot.
#   ylab: string, the name of the y axis in the plot.
#   ylim: numeric vector, the min and max of the y axis.
#   num_fig: integer, number of columns used to lay out the panels.
#   width: float, width of the figure (inches).
#   height: float, height of the figure (inches).
#   figure_path: string, directory and filename for the figure.
#
plot.avgSymbolExchangeSharesBox <- function(data,
                                            x = "exchange",
                                            y = "avg_volume_sh_s",
                                            xlab = "",
                                            ylab = "Mean Market Share",
                                            ylim = c(0, 0.425),
                                            num_fig = 1,
                                            width = 6,
                                            height = 4.5,
                                            figure_path) {

  # One box per value of x
  box_plot <- ggplot(data, aes_string(x = x, y = y))
  box_plot <- box_plot + geom_boxplot()

  # Restrict the visible y range through the coordinate system and set axis titles
  box_plot <- box_plot + coord_cartesian(ylim = ylim)
  box_plot <- box_plot + labs(x = xlab, y = ylab)

  # One panel per NYSE_label value, arranged in num_fig columns
  box_plot <- box_plot + facet_wrap(~ NYSE_label, ncol = num_fig)
  box_plot <- box_plot + theme(panel.spacing = unit(1, "lines"),
                               strip.text.x = element_text(size = 12))

  ggsave(filename = figure_path, plot = box_plot,
         width = width, height = height, units = "in")
}

##############################################################################
#_______________________________ Load Dataset _______________________________#
##############################################################################

# Redirect printed output to the log file from here on (closed by the final
# sink() call of the script)
sink(file.path(home.dir, "Final_Output/Logs/Exchange_Shares_log.txt"))

# Read the week-exchange level shares
weekly_ex_shares <- data.table::fread(
  file.path(data.dir, "Ex_Weekly_Shares_2007_2018.csv")
)

# The week is parsed as yyyymmdd and stored as a Date
weekly_ex_shares$week <- as.Date(
  as.character(weekly_ex_shares$week),
  format = "%Y%m%d"
)

# Replace the exchange code (EX) by the exchange name abbreviation
weekly_ex_shares$exchange <- getAbbr(weekly_ex_shares$EX)

####################################################################################
#_______________________________ Find Top Exchanges _______________________________#
####################################################################################

cat("*** Exchange shares of Top 8, Top 5, and regional ***", "\n")
cat("\n")

# Restrict data to 2015 (year() returns a number, so compare with a number)
weekly_ex_shares_2015 <- weekly_ex_shares[year(weekly_ex_shares$week) == 2015,]

# Sum volume by exchange
# We are summing volume over dates to get the total volume at each exchange during
# the year.
ex_volume_2015 <- weekly_ex_shares_2015 %>%
                  dplyr::group_by(exchange) %>%
                  dplyr::summarise(Regular_S_Vol = sum(as.numeric(Volume_S)),
                                   Regular_D_Vol = sum(as.numeric(Volume_D)))
ex_volume_2015 <- data.table(ex_volume_2015)

# Remove exchanges with 0 share volume
ex_volume_2015 <- ex_volume_2015[ex_volume_2015$Regular_S_Vol != 0,]
# Sum to total traded volume for all weeks (across exchanges)
ex_volume_2015$Total_Regular_S_Vol <- sum(ex_volume_2015$Regular_S_Vol, na.rm = TRUE)
# Order the exchanges in decreasing order of volume.
ex_reg_volume_2015 <- ex_volume_2015[order(-ex_volume_2015$Regular_S_Vol),]
# Generate volume traded as a share of total volume for each exchange.
ex_reg_volume_2015$Ex_Sh_S_Regular <- ex_reg_volume_2015$Regular_S_Vol / ex_reg_volume_2015$Total_Regular_S_Vol

# Results are printed explicitly so that they reach the log even when the
# script is run with source(), which does not auto-print top-level values.

# Top 5 Maker-Takers (the five exchanges with the largest volume).
# head() returns fewer names instead of NA padding if fewer exchanges exist.
top_5_regular <- head(ex_reg_volume_2015$exchange, 5)

print("The Top 5 Maker-Taker exchanges are:")
print(top_5_regular)
cat("\n")

print("The share of volume in the Top 5 exchanges is:")
print(sum(ex_reg_volume_2015[ex_reg_volume_2015$exchange %in% top_5_regular,]$Ex_Sh_S_Regular))
cat("\n")

# Top 8 Exchanges
top_8_regular <- head(ex_reg_volume_2015$exchange, 8)

print("The Top 8 exchanges are:")
print(top_8_regular)
cat("\n")

print("The share of volume in the Top 8 exchanges is:")
print(sum(ex_reg_volume_2015[ex_reg_volume_2015$exchange %in% top_8_regular,]$Ex_Sh_S_Regular))
cat("\n")

# Taker-Maker Exchanges (in the Top 8 but not in the Top 5)
top_taker_maker <- setdiff(top_8_regular, top_5_regular)

print("The taker-maker exchanges are: ")
print(top_taker_maker)
cat("\n")

print("The share of volume in the Taker-Maker exchanges is:")
print(sum(ex_reg_volume_2015[ex_reg_volume_2015$exchange %in% top_taker_maker,]$Ex_Sh_S_Regular))
cat("\n")

# Regional Exchanges (everything ranked below the Top 8).
# tail(x, -8) is empty when there are 8 or fewer exchanges, whereas 9:nrow()
# would count downwards and pick up the 8th exchange.
regional_ex <- tail(ex_reg_volume_2015$exchange, -8)

print("The regional exchanges are:")
print(regional_ex)
cat("\n")

print("The share of volume in the regional exchanges is:")
print(sum(ex_reg_volume_2015[ex_reg_volume_2015$exchange %in% regional_ex,]$Ex_Sh_S_Regular))
cat("\n")

######################################################################
#_______________________________ Plot _______________________________#
######################################################################

# Weekly exchange shares from 2007 to 2015
# (numeric comparison; comparing with the string "2015" would compare the
# years as text)
weekly_ex_shares_2007_2015 <- weekly_ex_shares[year(weekly_ex_shares$week) <= 2015,]
weekly_ex_shares_2007_2015 <- data.frame(weekly_ex_shares_2007_2015)

# Color order: the colors are listed in the same order as exchange_priority.
# The exchange factor below uses exchange_priority as its levels, and the
# unnamed color vector is presumably matched to those levels in order
# (assuming every level appears in the plotted data) -- TODO confirm.
exchange_priority <- c("ARCA", "BZX", "EDGX", "NSDQ", "NYSE", "EDGA", "BYX", "BX")
colors       <- c("purple", "green3", "#619CFF", "forest green", "navy","maroon", "orange1", "palevioletred1")

# Exchanges that are not listed in exchange_priority become NA in the factor
weekly_ex_shares_2007_2015$exchange <- factor(weekly_ex_shares_2007_2015$exchange, levels = exchange_priority)

##############################################################################################
#_______________________________ Exchange Shares, 2011 - 2015 _______________________________#
##############################################################################################

# Plot for the Top 8 exchanges
# NOTE: In order for the color scheme to be preserved, the exchange column is
# converted to a factor with levels exchange_priority before the data are
# restricted to the Top 8 exchanges.

#
# Note about exchange start dates:
# EDGA started showing up on the system on "2010-07-19"
# EDGX started showing up on the system on "2010-07-19"
# BATS started showing up on the system on "2008-10-28" (only for a few symbols)
#   but most symbols appeared on "2008-11-05"
# BATS-Y started showing up on the system on "2010-10-15" for select few
#   symbols but finished rolling out all symbols on "2010-10-22"
#

# Restrict weekly exchange shares to Top 8 exchanges (as determined for 2015)
weekly_ex_shares_2007_2015_Top8 <- weekly_ex_shares_2007_2015[weekly_ex_shares_2007_2015$exchange %in% top_8_regular,]

# Weekly exchange shares from 2011 to 2015 (numeric year comparison)
weekly_ex_shares_2011_2015 <- weekly_ex_shares_2007_2015[year(weekly_ex_shares_2007_2015$week) >= 2011,]
weekly_ex_shares_2011_2015 <- data.frame(weekly_ex_shares_2011_2015)
# Restrict weekly exchange shares to Top 8 exchanges
weekly_ex_shares_2011_2015_Top8 <- weekly_ex_shares_2011_2015[weekly_ex_shares_2011_2015$exchange %in% top_8_regular,]

# exchange_color_order is given the existing exchange_priority vector. The
# previous value, ex_color, is not defined anywhere in this script and only
# went unnoticed because the argument is never evaluated.
plot.exchangeShares(weekly_ex_shares_2011_2015_Top8,
                    interval = "week", y = "Regular_Sh_S",
                    color_group = "exchange",
                    ymin = 0,
                    ymax = 0.3,
                    exchange_color_order = exchange_priority,
                    colors = colors,
                    background = FALSE,
                    figure_path = file.path(fig.dir,"weekly_shares_2011_2015_Top8.png"))

#######################################################################################################
#_______________________________ Exchange Fixed Effects (Daily shares) _______________________________#
#######################################################################################################

cat("*** Aggregate exchange shares regression (s_jt) ***", "\n")
cat("\n")

# Load daily exchange shares
daily_ex_shares <- data.table::fread(file.path(data.dir, "Ex_Daily_Shares_2007_2018.csv"))

# Format date and exchange
daily_ex_shares$date <- as.Date(as.character(daily_ex_shares$DATE), "%Y%m%d")
daily_ex_shares$exchange <- getAbbr(daily_ex_shares$EX)

# Daily Exchange shares from 2011 to 2015
# (year() returns a number, so the bounds are numbers rather than strings)
daily_ex_shares_2011_2015 <- daily_ex_shares[year(daily_ex_shares$date) >= 2011 & year(daily_ex_shares$date) <= 2015,]

# Restrict daily exchange shares to Top 8 Exchanges
daily_ex_shares_2011_2015_top8 <- daily_ex_shares_2011_2015[daily_ex_shares_2011_2015$exchange %in% top_8_regular,]
daily_ex_shares_2011_2015_top8 <- data.frame(daily_ex_shares_2011_2015_top8)

# Calculate R2
print("R2 of the regression of exchange fixed effects on aggregate exchange volume shares (s_jt) for the top 8 exchange.")
R2_ex_shares_top8 <- regression.R2(daily_ex_shares_2011_2015_top8, "exchange", "Regular_Sh_S")
# Printed explicitly so the value reaches the log when run with source() too
print(R2_ex_shares_top8)
cat("\n")

################################################################################################################
#_______________________________ Symbol-Date Exchange Shares (s_ijt regression) _______________________________#
################################################################################################################

cat("*** Symbol-level exchange shares regression (s_ijt) ***", "\n")
cat("\n")

# Load data
depth_volume_top8 <- data.table::fread(file.path(data.dir, "Sym_Ex_Date_Shares_2015.csv"))

# Format date and exchange
depth_volume_top8$DATE <- as.Date(as.character(depth_volume_top8$DATE), "%Y%m%d")
depth_volume_top8$exchange <- getAbbr(depth_volume_top8$EX)

# How much volume is in the top 100?
top100_volume <- sum(depth_volume_top8[depth_volume_top8$f_Sample_T100 == 1,]$Volume_S, na.rm = TRUE)
all_volume <- sum(depth_volume_top8$Volume_S, na.rm = TRUE)

pct_volume_in_top100 <- (top100_volume / all_volume) * 100
# nsmall = 2 always shows two decimals. The previous positional 2 was taken
# as format()'s trim argument, not as a number of decimals.
pct_volume_in_top100 <- format(round(pct_volume_in_top100, 2), nsmall = 2)
print(paste0("The Top 100 constitute ", pct_volume_in_top100, "% of all volume"))
cat("\n")

# Find the R2 of the regression between s_ijt and exchange fixed effects + NYSE Listed indicator.
# For the top 100 symbols in sample 1

depth_volume_top8_t100 <- depth_volume_top8[depth_volume_top8$f_Sample_T100 == 1 & depth_volume_top8$exchange %in% top_8_regular,]
depth_volume_top8_t100 <- data.frame(depth_volume_top8_t100)

print("The R2 of the regression of daily exchange volume shares on exchange fixed effects and the NYSE listed indicator,")
print("among the Top 100 symbols in our sample and the Top 8 exchanges is as follows.")
r2_Top8_t100 <- regression.R2(depth_volume_top8_t100, c("exchange", "exchange:f_NYSE_Listed"), "Volume_Sh_S")
# Printed explicitly so the value reaches the log when run with source() too
print(r2_Top8_t100)
cat("\n")

# Close sink
sink()
